{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 从疝气病症预测病马的死亡率"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**说明:**\n",
    "\n",
    "将 `horseColicTraining.txt` 和 `horseColicTest.txt` 放在当前目录下。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义 Sigmoid 函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def sigmoid(inX):\n",
    "    return 1.0 / (1 + np.exp(-inX))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义一般的梯度提升算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gradAscent(dataMatIn, classLabels):\n",
    "    dataMatrix = np.mat(dataMatIn)             #convert to NumPy matrix\n",
    "    labelMat = np.mat(classLabels).transpose() #convert to NumPy matrix\n",
    "    m, n = np.shape(dataMatrix)\n",
    "    alpha = 0.001\n",
    "    maxCycles = 500\n",
    "    weights = np.ones((n, 1))\n",
    "    for k in range(maxCycles):              #heavy on matrix operations\n",
    "        h = sigmoid(dataMatrix*weights)     #matrix mult\n",
    "        error = (labelMat - h)              #vector subtraction\n",
    "        weights = weights + alpha * dataMatrix.transpose()* error #matrix mult\n",
    "    return weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义随机梯度上升算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def stocGradAscent0(dataMatrix, classLabels):\n",
    "    m, n = np.shape(dataMatrix)\n",
    "    alpha = 0.01\n",
    "    weights = np.ones(n)   #initialize to all ones\n",
    "    for i in range(m):\n",
    "        h = sigmoid(sum(dataMatrix[i]*weights))\n",
    "        error = classLabels[i] - h\n",
    "        weights = weights + alpha * error * dataMatrix[i]\n",
    "    return weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义改进的随机梯度上升算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def stocGradAscent1(dataMatrix, classLabels, numIter=150):\n",
    "    m, n = np.shape(dataMatrix)\n",
    "    weights = np.ones(n)   #initialize to all ones\n",
    "    for j in range(numIter):\n",
    "        dataIndex = list(range(m))\n",
    "        for i in range(m):\n",
    "            alpha = 4/(1.0+j+i)+0.0001    #apha decreases with iteration, does not\n",
    "            randIndex = int(np.random.uniform(0, len(dataIndex)))#go to 0 because of the constant\n",
    "            h = sigmoid(sum(dataMatrix[randIndex]*weights))\n",
    "            error = classLabels[randIndex] - h\n",
    "            weights = weights + alpha * error * dataMatrix[randIndex]\n",
    "            del(dataIndex[randIndex])\n",
    "    return weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 从疝气病症预测病马的死亡率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def classifyVector(inX, weights):\n",
    "    prob = sigmoid(sum(inX*weights))\n",
    "    if prob > 0.5: return 1.0\n",
    "    else: return 0.0\n",
    "\n",
    "def colicTest():\n",
    "    frTrain = open('./horseColicTraining.txt'); frTest = open('./horseColicTest.txt')\n",
    "    trainingSet = []; trainingLabels = []\n",
    "    for line in frTrain.readlines():\n",
    "        currLine = line.strip().split('\\t')\n",
    "        lineArr = []\n",
    "        for i in range(21):\n",
    "            lineArr.append(float(currLine[i]))\n",
    "        trainingSet.append(lineArr)\n",
    "        trainingLabels.append(float(currLine[21]))\n",
    "    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 1000)\n",
    "    errorCount = 0; numTestVec = 0.0\n",
    "    for line in frTest.readlines():\n",
    "        numTestVec += 1.0\n",
    "        currLine = line.strip().split('\\t')\n",
    "        lineArr = []\n",
    "        for i in range(21):\n",
    "            lineArr.append(float(currLine[i]))\n",
    "        if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[21]):\n",
    "            errorCount += 1\n",
    "    errorRate = (float(errorCount)/numTestVec)\n",
    "    print(\"the error rate of this test is: %f\" % errorRate)\n",
    "    return errorRate\n",
    "\n",
    "def multiTest():\n",
    "    numTests = 10; errorSum = 0.0\n",
    "    for k in range(numTests):\n",
    "        errorSum += colicTest()\n",
    "    print(\"after %d iterations the average error rate is: %f\" % (numTests, errorSum/float(numTests)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\ProgramLoc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: RuntimeWarning: overflow encountered in exp\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the error rate of this test is: 0.328358\n",
      "the error rate of this test is: 0.373134\n",
      "the error rate of this test is: 0.402985\n",
      "the error rate of this test is: 0.313433\n",
      "the error rate of this test is: 0.388060\n",
      "the error rate of this test is: 0.402985\n",
      "the error rate of this test is: 0.343284\n",
      "the error rate of this test is: 0.223881\n",
      "the error rate of this test is: 0.298507\n",
      "the error rate of this test is: 0.373134\n",
      "after 10 iterations the average error rate is: 0.344776\n"
     ]
    }
   ],
   "source": [
    "multiTest()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
